suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(RColorBrewer)
library(scatterplot3d)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_delim(file = here("doc/Samples.tsv"),
delim="\t",
col_types=cols(
SampleID=col_factor(),
Time=col_factor(),
Replicate=col_factor())
) %>%
mutate(Time=relevel(Time,"std"))
filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sort the raw data and samples
samples <- samples[match(basename(dirname(filelist)),samples$SampleID),]
stopifnot(all(samples$SampleID == basename(dirname(filelist))))
name the file list vector
names(filelist) <- samples$SampleID
Read the expression at the transcript level (we have no gene information)
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",txOut = TRUE)$counts))
Read the algae IDs
IDs <- scan(here("data/analysis/annotation/algae-IDs.txt"),
what = "character")
Subset the data
counts <- counts[IDs,]
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "4.5% percent (2244) of 49477 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),
axis.title.x=element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 2244 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by Time
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Time=samples$Time[match(ind,samples$SampleID)])
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 415722 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Time)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| B2_2_120hrs_A | B2_2_120hrs_B | B2_2_120hrs_C | B2_2_120hrs_D | B2_2_12hrs_A |
|---|---|---|---|---|
| 1.122 | 1.115 | 1.211 | 1.009 | 0.9364 |
| B2_2_12hrs_B | B2_2_12hrs_C | B2_2_12hrs_D | B2_2_24hrs_A | B2_2_24hrs_B |
|---|---|---|---|---|
| 0.9092 | 0.85 | 1.327 | 1.076 | 1.287 |
| B2_2_24hrs_C | B2_2_24hrs_D | B2_2_4hrs_A | B2_2_4hrs_B | B2_2_4hrs_C |
|---|---|---|---|---|
| 1.162 | 0.9715 | 0.7666 | 0.681 | 0.757 |
| B2_2_4hrs_D | B2_2_60minA | B2_2_60minB | B2_2_60minC | B2_2_60minD |
|---|---|---|---|---|
| 0.7047 | 1.03 | 1.327 | 1.128 | 1.239 |
| B2_2_72hrs_A | B2_2_72hrs_B | B2_2_72hrs_C | B2_2_72hrs_D | B2_2_std_A |
|---|---|---|---|---|
| 1.188 | 1.236 | 0.8674 | 0.9216 | 1.238 |
| B2_2_std_B | B2_2_std_C | B2_2_std_D |
|---|---|---|
| 1.094 | 1.088 | 1.325 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=1
An the number of possible combinations
nlevel=nlevels(dds$Time)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
mar=c(5.1,4.1,4.1,2.1)
The PCA shows that a large fraction of the variance is explained by both variables.
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(dds$Time)],pch=19)
legend("topleft",
fill=pal[1:nlevels(dds$Time)],
legend=levels(dds$Time))
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Replicate,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=dds$Time,
nrep=3)
vst.cutoff <- 3
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = dds$Time,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="")
The data looks good. Clearly, there has been a sample swap between B2_2_24hrs_D and B2_2_12hrs_D.
D12p <- which(dds$SampleID == "B2_2_12hrs_D")
D24p <- which(dds$SampleID == "B2_2_24hrs_D")
colData(dds)[D12p,"Time"] <- "24hrs"
colData(dds)[D24p,"Time"] <- "12hrs"
samples[D12p,"Time"] <- "24hrs"
samples[D24p,"Time"] <- "12hrs"
save(dds,file=here("data/analysis/salmon/dds-sample-swap-corrected.rda"))
write_tsv(samples,path = here("doc/Samples-swap-corrected.tsv"))
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.54.0 tximport_1.14.0
## [3] forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.3 purrr_0.3.3
## [7] readr_1.3.1 tidyr_1.0.0
## [9] tibble_2.1.3 tidyverse_1.3.0
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] plotly_4.9.1 pander_0.6.3
## [15] hyperSpec_0.99-20180627 ggplot2_3.2.1
## [17] lattice_0.20-38 here_0.1
## [19] gplots_3.0.1.1 DESeq2_1.26.0
## [21] SummarizedExperiment_1.16.0 DelayedArray_0.12.0
## [23] BiocParallel_1.20.0 matrixStats_0.55.0
## [25] Biobase_2.46.0 GenomicRanges_1.38.0
## [27] GenomeInfoDb_1.22.0 IRanges_2.20.1
## [29] S4Vectors_0.24.0 BiocGenerics_0.32.0
## [31] data.table_1.12.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.2
## [4] XVector_0.26.0 base64enc_0.1-3 fs_1.3.1
## [7] rstudioapi_0.10 hexbin_1.28.0 farver_2.0.1
## [10] affyio_1.56.0 bit64_0.9-7 AnnotationDbi_1.48.0
## [13] lubridate_1.7.4 xml2_1.2.2 splines_3.6.1
## [16] geneplotter_1.64.0 knitr_1.26 zeallot_0.1.0
## [19] Formula_1.2-3 jsonlite_1.6 broom_0.5.2
## [22] annotate_1.64.0 cluster_2.1.0 dbplyr_1.4.2
## [25] shiny_1.4.0 BiocManager_1.30.10 compiler_3.6.1
## [28] httr_1.4.1 backports_1.1.5 fastmap_1.0.1
## [31] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [34] limma_3.42.0 cli_1.1.0 later_1.0.0
## [37] acepack_1.4.1 htmltools_0.4.0 tools_3.6.1
## [40] affy_1.64.0 gtable_0.3.0 glue_1.3.1
## [43] GenomeInfoDbData_1.2.2 Rcpp_1.0.3 cellranger_1.1.0
## [46] vctrs_0.2.0 preprocessCore_1.48.0 gdata_2.18.0
## [49] nlme_3.1-142 crosstalk_1.0.0 xfun_0.11
## [52] rvest_0.3.5 testthat_2.3.0 mime_0.7
## [55] lifecycle_0.1.0 gtools_3.8.1 XML_3.98-1.20
## [58] zlibbioc_1.32.0 scales_1.1.0 promises_1.1.0
## [61] hms_0.5.2 yaml_2.2.0 memoise_1.1.0
## [64] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-28
## [67] stringi_1.4.3 RSQLite_2.1.2 highr_0.8
## [70] genefilter_1.68.0 checkmate_1.9.4 caTools_1.17.1.2
## [73] rlang_0.4.2 pkgconfig_2.0.3 bitops_1.0-6
## [76] evaluate_0.14 labeling_0.3 htmlwidgets_1.5.1
## [79] bit_1.1-14 tidyselect_0.2.5 magrittr_1.5
## [82] R6_2.4.1 generics_0.0.2 Hmisc_4.3-0
## [85] DBI_1.0.0 pillar_1.4.2 haven_2.2.0
## [88] foreign_0.8-72 withr_2.1.2 survival_3.1-7
## [91] RCurl_1.95-4.12 nnet_7.3-12 modelr_0.1.5
## [94] crayon_1.3.4 KernSmooth_2.23-16 rmarkdown_1.18
## [97] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.0
## [100] reprex_0.3.0 digest_0.6.23 xtable_1.8-4
## [103] httpuv_1.5.2 munsell_0.5.0 viridisLite_0.3.0